{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "'2.2.0'"
     },
     "metadata": {},
     "execution_count": 2
    }
   ],
   "source": [
    "import numpy as np\n",
    "import sys\n",
    "sys.path\n",
    "import tensorflow as tf\n",
    "\n",
    "import os\n",
    "os.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\"\n",
    "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"3\" \n",
    "tf.__version__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "samples = ['The cat sat on the mat.', 'The dog ate my homework.']\n",
    "\n",
    "token_index = {}\n",
    "for sample in samples:\n",
    "    for word in sample.split():\n",
    "        if word not in token_index:\n",
    "            token_index[word] = len(token_index) + 1\n",
    "max_length=10\n",
    "results = np.zeros(shape=(len(samples), max_length, max(token_index.values())+1))\n",
    "\n",
    "for i, sample in enumerate(samples):\n",
    "    for j, word in list(enumerate(sample.split()))[:max_length]:\n",
    "        index = token_index.get(word)\n",
    "        results[i, j, index] = 1."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "array([[[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],\n        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],\n        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],\n        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],\n        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],\n        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],\n\n       [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],\n        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],\n        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],\n        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],\n        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]])"
     },
     "metadata": {},
     "execution_count": 4
    }
   ],
   "source": [
    "results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "\n",
    "from tensorflow.keras.preprocessing.text import Tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "[[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]]\n[[0. 1. 1. ... 0. 0. 0.]\n [0. 1. 0. ... 0. 0. 0.]]\n"
    }
   ],
   "source": [
    "tokenizer = Tokenizer(num_words=1000)\n",
    "tokenizer.fit_on_texts(samples)\n",
    "sequences = tokenizer.texts_to_sequences(samples)\n",
    "print(sequences)\n",
    "one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')\n",
    "print(one_hot_results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tensorflow.keras.layers import Embedding\n",
    "\n",
    "embedding_layer = Embedding(1000, 64)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "(25000, 30)"
     },
     "metadata": {},
     "execution_count": 8
    }
   ],
   "source": [
    "from tensorflow.keras.datasets import imdb\n",
    "from tensorflow.keras.preprocessing import sequence\n",
    "\n",
    "max_features = 10000\n",
    "maxlen = 30\n",
    "\n",
    "(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)\n",
    "x_train = sequence.pad_sequences(x_train, maxlen=maxlen)\n",
    "x_test = sequence.pad_sequences(x_test, maxlen=maxlen)\n",
    "x_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "Model: \"sequential\"\n_________________________________________________________________\nLayer (type)                 Output Shape              Param #   \n=================================================================\nembedding_1 (Embedding)      (None, 30, 8)             80000     \n_________________________________________________________________\nflatten (Flatten)            (None, 240)               0         \n_________________________________________________________________\ndense (Dense)                (None, 32)                7712      \n_________________________________________________________________\ndense_1 (Dense)              (None, 1)                 33        \n=================================================================\nTotal params: 87,745\nTrainable params: 87,745\nNon-trainable params: 0\n_________________________________________________________________\n"
    }
   ],
   "source": [
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.layers import Flatten, Dense, Embedding\n",
    "\n",
    "model = Sequential()\n",
    "model.add(Embedding(10000, 8, input_length=maxlen))\n",
    "model.add(Flatten())\n",
    "model.add(Dense(32, activation='relu'))\n",
    "model.add(Dense(1, activation='sigmoid'))\n",
    "model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "Epoch 1/10\n625/625 [==============================] - 3s 5ms/step - loss: 0.5991 - acc: 0.6654 - val_loss: 0.4886 - val_acc: 0.7582\nEpoch 2/10\n625/625 [==============================] - 3s 4ms/step - loss: 0.4139 - acc: 0.8082 - val_loss: 0.4639 - val_acc: 0.7756\nEpoch 3/10\n625/625 [==============================] - 3s 4ms/step - loss: 0.3515 - acc: 0.8461 - val_loss: 0.4709 - val_acc: 0.7774\nEpoch 4/10\n625/625 [==============================] - 3s 4ms/step - loss: 0.2995 - acc: 0.8730 - val_loss: 0.4944 - val_acc: 0.7764\nEpoch 5/10\n625/625 [==============================] - 3s 4ms/step - loss: 0.2486 - acc: 0.9013 - val_loss: 0.5401 - val_acc: 0.7680\nEpoch 6/10\n625/625 [==============================] - 3s 4ms/step - loss: 0.2042 - acc: 0.9222 - val_loss: 0.5927 - val_acc: 0.7596\nEpoch 7/10\n625/625 [==============================] - 3s 4ms/step - loss: 0.1653 - acc: 0.9395 - val_loss: 0.6475 - val_acc: 0.7504\nEpoch 8/10\n625/625 [==============================] - 3s 4ms/step - loss: 0.1324 - acc: 0.9535 - val_loss: 0.7204 - val_acc: 0.7422\nEpoch 9/10\n625/625 [==============================] - 3s 4ms/step - loss: 0.1062 - acc: 0.9637 - val_loss: 0.8180 - val_acc: 0.7462\nEpoch 10/10\n625/625 [==============================] - 3s 4ms/step - loss: 0.0844 - acc: 0.9717 - val_loss: 0.8589 - val_acc: 0.7356\n"
    }
   ],
   "source": [
    "history = model.fit(x_train, y_train, epochs=10, batch_size=32, validation_split=0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "Epoch 1/10\n157/157 [==============================] - 2s 13ms/step - loss: 0.5595 - acc: 0.7074 - val_loss: 0.4980 - val_acc: 0.7630\nEpoch 2/10\n157/157 [==============================] - 2s 11ms/step - loss: 0.4081 - acc: 0.8166 - val_loss: 0.4562 - val_acc: 0.7732\nEpoch 3/10\n157/157 [==============================] - 2s 11ms/step - loss: 0.3571 - acc: 0.8439 - val_loss: 0.4591 - val_acc: 0.7788\nEpoch 4/10\n157/157 [==============================] - 2s 11ms/step - loss: 0.3308 - acc: 0.8602 - val_loss: 0.4623 - val_acc: 0.7746\nEpoch 5/10\n157/157 [==============================] - 2s 11ms/step - loss: 0.3100 - acc: 0.8684 - val_loss: 0.4898 - val_acc: 0.7734\nEpoch 6/10\n157/157 [==============================] - 2s 11ms/step - loss: 0.2927 - acc: 0.8784 - val_loss: 0.4844 - val_acc: 0.7664\nEpoch 7/10\n157/157 [==============================] - 2s 10ms/step - loss: 0.2780 - acc: 0.8839 - val_loss: 0.5090 - val_acc: 0.7598\nEpoch 8/10\n157/157 [==============================] - 2s 11ms/step - loss: 0.2621 - acc: 0.8942 - val_loss: 0.5289 - val_acc: 0.7672\nEpoch 9/10\n157/157 [==============================] - 2s 11ms/step - loss: 0.2456 - acc: 0.9002 - val_loss: 0.5466 - val_acc: 0.7612\nEpoch 10/10\n157/157 [==============================] - 2s 11ms/step - loss: 0.2295 - acc: 0.9097 - val_loss: 0.5781 - val_acc: 0.7604\n"
    }
   ],
   "source": [
    "from tensorflow.keras.layers import LSTM\n",
    "\n",
    "model = Sequential()\n",
    "model.add(Embedding(max_features, 32))\n",
    "model.add(LSTM(32))\n",
    "model.add(Dense(1, activation='sigmoid'))\n",
    "\n",
    "model.compile(optimizer='rmsprop',\n",
    "              loss='binary_crossentropy',\n",
    "              metrics=['acc'])\n",
    "history = model.fit(x_train, y_train,\n",
    "                    epochs=10,\n",
    "                    batch_size=128,\n",
    "                    validation_split=0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "imdb_dir = '/home/lilanqing/download/aclImdb'\n",
    "train_dir = os.path.join(imdb_dir, 'train')\n",
    "\n",
    "labels = []\n",
    "texts = []\n",
    "\n",
    "for label_type in ['neg', 'pos']:\n",
    "    dir_name = os.path.join(train_dir, label_type)\n",
    "    for fname in os.listdir(dir_name):\n",
    "        if fname[-4:] == '.txt':\n",
    "            f = open(os.path.join(dir_name, fname))\n",
    "            texts.append(f.read())\n",
    "            f.close()\n",
    "            if label_type == 'neg':\n",
    "                labels.append(0)\n",
    "            else:\n",
    "                labels.append(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tensorflow.keras.preprocessing.text import Tokenizer\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "maxlen = 100\n",
    "training_samples = 20000\n",
    "validation_samples = 10000\n",
    "max_words = 10000\n",
    "\n",
    "tokenizer = Tokenizer(num_words=max_words)\n",
    "tokenizer.fit_on_texts(texts)\n",
    "sequences = tokenizer.texts_to_sequences(texts)\n",
    "\n",
    "word_index = tokenizer.word_index\n",
    "print('Found %s unique tokens.' % len(word_index))\n",
    "\n",
    "data = pad_sequences(sequences, maxlen=maxlen)\n",
    "\n",
    "labels = np.asarray(labels)\n",
    "print(data.shape)\n",
    "print(labels.shape)\n",
    "\n",
    "indices = np.arange(data.shape[0])\n",
    "np.random.shuffle(indices)\n",
    "data = data[indices]\n",
    "labels = labels[indices]\n",
    "\n",
    "x_train = data[:training_samples]\n",
    "y_train = labels[:training_samples]\n",
    "x_val = data[training_samples:training_samples+validation_samples]\n",
    "y_val = data[training_samples:training_samples+validation_samples]\n",
    "print(x_train[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.layers import Embedding, Flatten, Dense"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "embedding_dim = 100\n",
    "model = Sequential()\n",
    "model.add(Embedding(max_words, embedding_dim, input_length=maxlen))\n",
    "model.add(Flatten())\n",
    "model.add(Dense(32, activation='relu'))\n",
    "model.add(Dense(1, activation='sigmoid'))\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['acc'])\n",
    "history = model.fit(x_train, y_train,epochs=10,batch_size=32,validation_split=0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.layers import Embedding, SimpleRNN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'Sequential' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-1-58a280e51a93>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mSequential\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mEmbedding\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m10000\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m32\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mSimpleRNN\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m32\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msummary\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mNameError\u001b[0m: name 'Sequential' is not defined"
     ]
    }
   ],
   "source": [
    "model = Sequential()\n",
    "model.add(Embedding(10000, 32))\n",
    "model.add(SimpleRNN(32))\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "hide_input": false,
  "kernelspec": {
   "display_name": "Python 3.8.3 64-bit",
   "language": "python",
   "name": "python38364bit022f2b83e45b4f599a9763bd4e737d47"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3-final"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}